library(tidyverse)
library(janitor)
library(readxl)Cleaning TEA Salaries
Setup
Importing the libraries we need.
Import all the salary files
salary_files_list <- list.files(
"data-raw",
pattern = ".csv",
full.names = T
)
salary_files_list [1] "data-raw/Staff Salary FTE Report_Statewide_Districts_2014-2015.csv"
[2] "data-raw/Staff Salary FTE Report_Statewide_Districts_2015-2016.csv"
[3] "data-raw/Staff Salary FTE Report_Statewide_Districts_2016-2017.csv"
[4] "data-raw/Staff Salary FTE Report_Statewide_Districts_2017-2018.csv"
[5] "data-raw/Staff Salary FTE Report_Statewide_Districts_2018-2019.csv"
[6] "data-raw/Staff Salary FTE Report_Statewide_Districts_2019-2020.csv"
[7] "data-raw/Staff Salary FTE Report_Statewide_Districts_2020-2021.csv"
[8] "data-raw/Staff Salary FTE Report_Statewide_Districts_2021-2022.csv"
[9] "data-raw/Staff Salary FTE Report_Statewide_Districts_2022-2023.csv"
[10] "data-raw/Staff Salary FTE Report_Statewide_Districts_2023-2024.csv"
import_files <- function(file_name) {
df <- file_name |>
read_csv( #read csv first and skip the TEA heading lines
skip = 5) |>
clean_names() |>
mutate(fte_count = as.numeric(fte_count),
total_base_pay = as.numeric(total_base_pay),
average_base_pay = as.numeric(average_base_pay))
}
tea_salaries <- salary_files_list |> set_names(basename) |>
map(import_files) |> #map on our new function
list_rbind(names_to = "source")Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 39350 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Region, County, District, District Name, Charter Status, Staff_Cat...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `fte_count = as.numeric(fte_count)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 39096 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Region, County, District, District Name, Charter Status, Staff_Cat...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `fte_count = as.numeric(fte_count)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 39429 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 51617 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Region, County, District, District Name, Charter Status, Staff_Cat...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `fte_count = as.numeric(fte_count)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 52126 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 52224 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 52500 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 53092 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Region, County, District, District Name, Charter Status, Staff_Cat...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `fte_count = as.numeric(fte_count)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 53521 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 53839 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Region, County, District, District Name, Charter Status, Staff_Cate...
dbl (3): FTE Count, Total Base Pay, Average Base Pay
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tea_salariesI know that the end of each file has weird extra lines with extra text. Let’s remove all of those lines that don’t have an actual region. Then we check that the files combined correctly by looking at the end of the datatset
tea_salaries <- tea_salaries |> filter(str_length(region) <= 2)
tea_salaries |> tail(50)Another check that the files combined correctly.
tea_salaries |> group_by(district) |>
summarize(total = n())Another check that the files combined correctly.
tea_salaries$staff_category |> unique()[1] "TEACHING STAFF" "SUPPORT STAFF" "ADMINISTRATIVE STAFF"
[4] NA "PARAPROFESSIONAL STAFF" "AUXILIARY"
[7] "AUXILIARY STAFF" "TRADITIONAL ISD/CSD"
“TRADITIONAL ISD/CSD” shouldn’t be in there. Let’s see where that error is occurring.
tea_salaries |> filter(staff_category == "TRADITIONAL ISD/CSD")It’s only one row…and it looks like it might be missing data. Let’s look at all the rows for this district.
tea_salaries |> filter(district == "176901")These look normal…we only want staff_category that is “TEACHING STAFF” anyways so we can disregard that weird row since it has “ADMINISTRATIVE STAFF”.
Now we want to pull the school year from the file name (source column). Then we will get rid of that source column cause we don’t need it anymore and then I need ot factor the years so R understands that the strings have an order.
teaching_staff_salaries <- tea_salaries |> filter(staff_category == "TEACHING STAFF") |>
mutate(school_year = substr(source, start = (nchar(source) - 13 + 1), stop = nchar(source)-4))
teaching_staff_salaries <- teaching_staff_salaries |> select(-source) |>
mutate(
school_year = factor(school_year,
levels = c("2014-2015", "2015-2016", "2016-2017", "2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024")))
colnames(teaching_staff_salaries)[colnames(teaching_staff_salaries) == "district"] <- "district_number"
teaching_staff_salariesDistrict Types and Join
Let’s read in the district types spreadsheet. We also want to clean the names and then change the first column to be district_name so it doesn’t conflict when we join it later.
district_types <- read_xlsx("data-raw/district-type2223.xlsx", sheet = "2223_Data") |> clean_names()
colnames(district_types)[colnames(district_types) == 'district'] <- 'district_name'
district_typesNow I am going to join the district type data frame with the district salary data.
salaries_district_types <- teaching_staff_salaries |> left_join(district_types, by = "district_number")
salaries_district_types salaries_district_types |> count(district_number, district_name.x, district_name.y) |> count(district_number, sort = T)salaries_district_types |> filter(district_number == "227824") salaries_district_types <- salaries_district_types |> mutate(
end_year = substr(school_year, start = 6, stop = 9)
)
salaries_district_typesExport
I want to export the above dataframe to use for analysis in my next notebook.
salaries_district_types |> write_rds("data-processed/teaching_staff_salaries.rds")